{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "af29aa8d",
   "metadata": {},
   "source": [
    "# 数据集的划分\n",
    "本次数据集仅需划分为70%的训练集和30%的测试集，在TensorFlow中提供了划分训练集和验证集的方式，故此处仅将全部数据集划分为两部分，后续从训练集中再划分出训练集和验证集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a4a4f3af",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import random\n",
    "import shutil\n",
    "\n",
    "\n",
    "def DivideDataSet(fileDir, class_name):\n",
    "    imgpath=os.path.join(fileDir,class_name)#按顺序读取到第一类照片名字\n",
    "    image_list = os.listdir(imgpath)  # 获取原始图片路径中的所有图片\n",
    "    image_number = len(image_list)    # 获取原始图片路径中的图片数目\n",
    "    train_number = int(image_number * train_rate) # 图像数量乘以随机比例得出需要多少张训练图像\n",
    "    test_number = image_number - train_number    # 剩下的做验证集图像\n",
    "    print(class_name, \"文件夹下有\", image_number, \"张图片，分为\", train_number, \"张训练集和\", test_number, \"张测试集\")\n",
    "    \n",
    "    train_sample = random.sample(image_list, train_number)   # 从image_list中随机获取一定比例的图像.\n",
    "#     test_sample = random.sample(list(set(image_list) - set(train_sample)), test_number)\n",
    "#     val_sample = list(set(image_list ) - set(train_sample) - set(test_sample))\n",
    "    test_sample = list(set(image_list) - set(train_sample))  # 仅划分测试集和训练集时注释上面两行\n",
    "\n",
    "    sample = [train_sample, test_sample] # 生成列表\n",
    "    # 复制图像到目标文件夹\n",
    "    for k in range(len(save_dir)): # 地址长度，目前k是两个数，0和1\n",
    "        tmp_dir = os.path.join(save_dir[k],class_name)\n",
    "#         print(tmp_dir)\n",
    "        if os.path.isdir(tmp_dir):# 判断路径是否存在\n",
    "            for name in sample[k]:# sample[0]为train_sample中的数据，整句是train_sample中的数据循序进行遍历\n",
    "                shutil.copy(os.path.join(imgpath, name), os.path.join(tmp_dir, name))# join的作用是拼接字符串\n",
    "        else:\n",
    "            os.makedirs(tmp_dir)#建立图像路径\n",
    "#             print(tmp_dir)\n",
    "            for name in sample[k]:\n",
    "                shutil.copy(os.path.join(imgpath, name), os.path.join(tmp_dir, name))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "87d834a6",
   "metadata": {},
   "source": [
    "程序运行前需要手动将原始数据集的cloud，cloudy，sun的图片放到DataSet的对应目录下，由于划分完全随机，故此程序仅运行一次，不可多次运行，否则会出现训练集和测试集数据混杂，影响最终效果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9ec08e7a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cloud 文件夹下有 2382 张图片，分为 1667 张训练集和 715 张测试集\n",
      "cloudy 文件夹下有 4008 张图片，分为 2805 张训练集和 1203 张测试集\n",
      "sun 文件夹下有 5413 张图片，分为 3789 张训练集和 1624 张测试集\n",
      "划分完毕!\n",
      "---------------\n",
      "训练集和测试集划分共耗时 11.4973 s!\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "time_start = time.time()\n",
    "\n",
    "# 训练集比例\n",
    "train_rate = 0.7\n",
    "test_rate = 0.3\n",
    "\n",
    "# 原始数据集路径\n",
    "origion_path = './DataSet(resized)/'\n",
    "\n",
    "# 保存路径\n",
    "save_train_dir = './TrainDataSet(resized)/'#训练集图片地址\n",
    "save_test_dir = './TestDataSet(resized)/'#测试集图片地址\n",
    "save_val_dir = ''  # 验证集图片地址\n",
    "\n",
    "save_dir = [save_train_dir, save_test_dir]\n",
    "\n",
    "\n",
    "# 数据集类别及数量\n",
    "file_list = os.listdir(origion_path)\n",
    "num_classes = len(file_list)\n",
    "for i in range(num_classes):\n",
    "    class_name = file_list[i]\n",
    "    DivideDataSet(origion_path, class_name)\n",
    "print('划分完毕!')\n",
    "time_end = time.time()\n",
    "print('---------------')\n",
    "print('训练集和测试集划分共耗时 %.4f s!' % (time_end - time_start))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dfa85155",
   "metadata": {},
   "source": [
    "输出TrainDataSet和TrainDataSet下的cloud文件夹前十张验证一下"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "79f353b1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "训练集图片: 1667\n",
      "cloud (1).jpeg\n",
      "cloud (10).jpeg\n",
      "cloud (100).jpeg\n",
      "cloud (1001).jpeg\n",
      "cloud (1003).jpeg\n",
      "cloud (1005).jpeg\n",
      "cloud (1008).jpeg\n",
      "cloud (1009).jpeg\n",
      "cloud (101).jpeg\n",
      "cloud (1012).jpeg\n",
      "-------------------\n",
      "测试集图片: 715\n",
      "cloud (1000).jpeg\n",
      "cloud (1002).jpeg\n",
      "cloud (1004).jpeg\n",
      "cloud (1006).jpeg\n",
      "cloud (1007).jpeg\n",
      "cloud (1010).jpeg\n",
      "cloud (1011).jpeg\n",
      "cloud (1019).jpeg\n",
      "cloud (1021).jpeg\n",
      "cloud (1023).jpeg\n"
     ]
    }
   ],
   "source": [
    "train_list = os.listdir(os.path.join(save_train_dir, 'cloud'))\n",
    "print(\"训练集图片:\",len(train_list))\n",
    "for i in range(10):\n",
    "    print(train_list[i])\n",
    "print(\"-------------------\")\n",
    "test_list = os.listdir(os.path.join(save_test_dir, 'cloud'))\n",
    "print(\"测试集图片:\",len(test_list))\n",
    "for i in range(10):\n",
    "    print(test_list[i])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5ff853c6",
   "metadata": {},
   "source": [
    "此处的顺序为字符顺序而非数字顺序，但仔细看还是能分辨出来原本相邻的图片被按比例随机分给了两个文件夹"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
